In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

1.1a

In [41]:
path = "TelcomCustomer-Churn_1.csv"
telecom1 = pd.read_csv(path)
telecom1
Out[41]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No
... ... ... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes DSL Yes
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes Fiber optic No
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service DSL Yes
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes Fiber optic No
7042 3186-AJIEK Male 0 No No 66 Yes No Fiber optic Yes

7043 rows × 10 columns

1.1b

In [42]:
path = "TelcomCustomer-Churn_2.csv"
telecom2 = pd.read_csv(path)
telecom2
Out[42]:
customerID OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE No Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
... ... ... ... ... ... ... ... ... ... ... ... ...
7038 6840-RESVB No Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.5 No
7039 2234-XADUH Yes Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.9 No
7040 4801-JZAZL No No No No No Month-to-month Yes Electronic check 29.60 346.45 No
7041 8361-LTMKD No No No No No Month-to-month Yes Mailed check 74.40 306.6 Yes
7042 3186-AJIEK No Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.5 No

7043 rows × 12 columns

1.1c

In [43]:
telecom = pd.merge(telecom1,telecom2,on = 'customerID', how='left')
telecom
Out[43]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes DSL Yes ... Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.5 No
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes Fiber optic No ... Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.9 No
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service DSL Yes ... No No No No Month-to-month Yes Electronic check 29.60 346.45 No
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes Fiber optic No ... No No No No Month-to-month Yes Mailed check 74.40 306.6 Yes
7042 3186-AJIEK Male 0 No No 66 Yes No Fiber optic Yes ... Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.5 No

7043 rows × 21 columns

1.1d

In [5]:
names1 = telecom1.columns.values.tolist()
names2 = telecom2.columns.values.tolist()
names3 = telecom.columns.values.tolist()
print(names1)
print(names2)
print(names3)
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity']
['customerID', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

From the above we can say that all the columns are incorporated in the merged dataframe but the sum of number of columns in first and second dataframe is not equal to number of columns in merged dataframe as both of them contain same coulmn('customer ID') on which we merged.

1.2 a & b

In [6]:
telecom.isnull().sum()
Out[6]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Indicates that there were no missing values. However, this is done before converting continuous values to float(where the datatype is object even for continuous values).

In [7]:
telecom.dtypes
Out[7]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

On observing values in the dataset we can notice that SeniorCitizen, tenure, MonthlyCharges and TotalCharges are represented in numbers. But TotalCharges is of 'object' datatype. Hence, we need to convert it into 'float' as it is having continuous numeric value.

1.2b

In [8]:
telecom['TotalCharges']=pd.to_numeric(telecom['TotalCharges'],errors="coerce")
telecom.dtypes
Out[8]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object
In [9]:
telecom.isnull().sum()
Out[9]:
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

TotalCharges have 11 missing values. As this number is negligable when compared to total number of values we can drop them.

In [10]:
telecom = telecom.dropna()
In [11]:
telecom.isnull().sum()
Out[11]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Now the data is clean and doesn't have any missing values.

In [12]:
telecom[['tenure', 'MonthlyCharges', 'TotalCharges']].describe()
Out[12]:
tenure MonthlyCharges TotalCharges
count 7032.000000 7032.000000 7032.000000
mean 32.421786 64.798208 2283.300441
std 24.545260 30.085974 2266.771362
min 1.000000 18.250000 18.800000
25% 9.000000 35.587500 401.450000
50% 29.000000 70.350000 1397.475000
75% 55.000000 89.862500 3794.737500
max 72.000000 118.750000 8684.800000

1.2 c

In [13]:
data_size = telecom.Churn.value_counts(sort=True)
colors = ['blue','orange']
labels = 'Yes', 'No'
explode = (0, 0.1) 
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90,)
plt.title('Percentage of Churn in dataset')
plt.show()
In [14]:
data_size = telecom.gender.value_counts(sort=True)
colors = ['red','orange']
labels = 'Yes', 'No'
explode = (0, 0.1) 
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90,)
plt.title('Percentage of males and females in dataset')
plt.show()
In [15]:
data_size = telecom.Partner.value_counts(sort=True)
colors = ['red','orange']
labels = 'Yes', 'No'
explode = (0, 0.1) 
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90,)
plt.title('Percentage of persons having partner in dataset')
plt.show()
In [16]:
data_size = telecom.Dependents.value_counts(sort=True)
colors = ['blue','orange']
labels = 'Yes', 'No'
explode = (0, 0.1) 
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90,)
plt.title('Percentage of people with dependents in dataset')
plt.show()
In [17]:
data_size = telecom.PhoneService.value_counts(sort=True)
colors = ['blue','orange']
labels = 'Yes', 'No'
explode = (0, 0.1) 
plt.pie(data_size, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90,)
plt.title('Percentage of customers having PhoneService in dataset')
plt.show()

1-2d

Insights from pie-chart:-

-Only 26.6% of customers have churned out and 73.4% have stayed with the company.

-Data has almost equal number of males and females as customers.

-Almost half of the customers have partners.

-70.2% have dependents and 29.8% doesn't not have dependents.

-Approximately 10% customers do not have phoneservice and remaining 90% have phoneservice.

In [18]:
print(telecom.gender.value_counts())
print(telecom.SeniorCitizen.value_counts())
print(telecom.Partner.value_counts())
print(telecom.Dependents.value_counts())
print(telecom.PhoneService.value_counts())
print(telecom.MultipleLines.value_counts())
print(telecom.InternetService.value_counts())
print(telecom.OnlineSecurity.value_counts())
print(telecom.DeviceProtection.value_counts())
print(telecom.TechSupport.value_counts())
print(telecom.StreamingTV.value_counts())
print(telecom.StreamingMovies.value_counts())
print(telecom.Contract.value_counts())
print(telecom.PaperlessBilling.value_counts())
print(telecom.PaymentMethod.value_counts())
print(telecom.Churn.value_counts())
Male      3549
Female    3483
Name: gender, dtype: int64
0    5890
1    1142
Name: SeniorCitizen, dtype: int64
No     3639
Yes    3393
Name: Partner, dtype: int64
No     4933
Yes    2099
Name: Dependents, dtype: int64
Yes    6352
No      680
Name: PhoneService, dtype: int64
No                  3385
Yes                 2967
No phone service     680
Name: MultipleLines, dtype: int64
Fiber optic    3096
DSL            2416
No             1520
Name: InternetService, dtype: int64
No                     3497
Yes                    2015
No internet service    1520
Name: OnlineSecurity, dtype: int64
No                     3094
Yes                    2418
No internet service    1520
Name: DeviceProtection, dtype: int64
No                     3472
Yes                    2040
No internet service    1520
Name: TechSupport, dtype: int64
No                     2809
Yes                    2703
No internet service    1520
Name: StreamingTV, dtype: int64
No                     2781
Yes                    2731
No internet service    1520
Name: StreamingMovies, dtype: int64
Month-to-month    3875
Two year          1685
One year          1472
Name: Contract, dtype: int64
Yes    4168
No     2864
Name: PaperlessBilling, dtype: int64
Electronic check             2365
Mailed check                 1604
Bank transfer (automatic)    1542
Credit card (automatic)      1521
Name: PaymentMethod, dtype: int64
No     5163
Yes    1869
Name: Churn, dtype: int64

CustomerID can be dropped from the data

In [19]:
telecom.drop('customerID', axis=1, inplace=True)

1-2e

In [20]:
telecom = telecom.apply(LabelEncoder().fit_transform)
telecom.head()
Out[20]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 1 2 142 74 0
1 1 0 0 0 33 1 0 0 2 0 2 0 0 0 1 0 3 497 3624 0
2 1 0 0 0 1 1 0 0 2 2 0 0 0 0 0 1 3 435 536 1
3 1 0 0 0 44 0 1 0 2 0 2 2 0 0 1 0 0 266 3570 0
4 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 2 728 674 1

1-2 f

In [21]:
x = telecom.drop('Churn', axis=1)
y = telecom.pop('Churn')
In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

1-2 g

In [23]:
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_test_sc = sc.fit_transform(x_test)

1.3 a

In [24]:
pip install xgboost
Requirement already satisfied: xgboost in c:\users\pandu\anaconda3\lib\site-packages (1.7.1)
Requirement already satisfied: numpy in c:\users\pandu\anaconda3\lib\site-packages (from xgboost) (1.21.5)
Requirement already satisfied: scipy in c:\users\pandu\anaconda3\lib\site-packages (from xgboost) (1.7.3)
Note: you may need to restart the kernel to use updated packages.
In [25]:
import xgboost as xgb
In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score
In [27]:
# Printing list of parameters in XGBoost
In [28]:
xgboost = XGBClassifier()
xgboost.get_params()
Out[28]:
{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}
In [29]:
D_train = xgb.DMatrix(x_train, label=y_train)
D_test = xgb.DMatrix(x_test, label=y_test)
In [30]:
param = {
    'eta': 0.3, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 3} 

steps = 20  
In [31]:
# Training the data on xgb
In [32]:
model = xgb.train(param, D_train, steps)
model = xgb.train(param, D_test, steps)
In [33]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

preds = model.predict(D_train)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision on trained data = {}".format(precision_score(y_train, best_preds, average='macro')))
print("Recall on trained data = {}".format(recall_score(y_train, best_preds, average='macro')))
print("Accuracy on trained data = {}".format(accuracy_score(y_train, best_preds)))

preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision on test data = {}".format(precision_score(y_test, best_preds, average='macro')))
print("Recall on test data = {}".format(recall_score(y_test, best_preds, average='macro')))
print("Accuracy on test data = {}".format(accuracy_score(y_test, best_preds)))
Precision on trained data = 0.7432563972247626
Recall on trained data = 0.70874626293265
Accuracy on trained data = 0.7968
Precision on test data = 0.7961161619264083
Recall on test data = 0.7530393222153982
Accuracy on test data = 0.8351101634683724

Performance is better on test data than trained data

In [34]:
# RandomSearch
In [35]:
param_grid = { 
    
    "learning_rate": [0.0001,0.001, 0.01, 0.1, 1] ,
    "max_depth": range(3,21,3),
    "gamma": [i/10.0 for i in range(0,5)],
    "colsample_bytree": [i/10.0 for i in range(3,10)],
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 10, 100],
    "reg_lambda": [1e-5, 1e-2, 0.1, 1, 10, 100]}

scoring = ['recall']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
In [36]:
random_search = RandomizedSearchCV(estimator=xgboost, 
                           param_distributions=param_grid, 
                           n_iter=48,
                           scoring=scoring, 
                           refit='recall', 
                           n_jobs=-1, 
                           cv=kfold, 
                           verbose=0)

random_result = random_search.fit(x_train, y_train)
random_result
Out[36]:
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True),
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importanc...
                                           num_parallel_tree=None,
                                           predictor=None, random_state=None, ...),
                   n_iter=48, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.6,
                                                             0.7, 0.8, 0.9],
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.0001, 0.001, 0.01,
                                                          0.1, 1],
                                        'max_depth': range(3, 21, 3),
                                        'reg_alpha': [1e-05, 0.01, 0.1, 1, 10,
                                                      100],
                                        'reg_lambda': [1e-05, 0.01, 0.1, 1, 10,
                                                       100]},
                   refit='recall', scoring=['recall'])
In [37]:
print(f'The best score is {random_result.best_score_:.4f}')
print(f'The best hyperparameters are {random_result.best_params_}')
The best score is 0.5383
The best hyperparameters are {'reg_lambda': 100, 'reg_alpha': 10, 'max_depth': 6, 'learning_rate': 1, 'gamma': 0.3, 'colsample_bytree': 0.4}

score is not good with randomsearch

In [38]:
# 1.3b
In [39]:
import pickle
from sklearn.datasets import load_digits
In [88]:
xgb_params = {
    'objective': 'binary:logistic',
    'reg_lambda': 0.8,
    'reg_alpha': 0.4,
    'max_depth': 10,
    'max_delta_step': 1,
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(x, y, eval_metric='auc', verbose=True)

pickle.dump(clf, open("xgb_temp.pkl", "wb"))
clf2 = pickle.load(open("xgb_temp.pkl", "rb"))

assert np.allclose(clf.predict(x), clf2.predict(x))
print(clf2.get_xgb_params())
{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'eval_metric': None, 'gamma': 0, 'gpu_id': -1, 'grow_policy': 'depthwise', 'interaction_constraints': '', 'learning_rate': 0.300000012, 'max_bin': 256, 'max_cat_threshold': 64, 'max_cat_to_onehot': 4, 'max_delta_step': 1, 'max_depth': 10, 'max_leaves': 0, 'min_child_weight': 1, 'monotone_constraints': '()', 'n_jobs': 0, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0.4, 'reg_lambda': 0.8, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None}
In [99]:
# GridSearch
In [100]:
from sklearn.model_selection import GridSearchCV

clf = xgb.XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(x_train, y_train)
Out[100]:
GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, ...),
             n_jobs=4,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'eta': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
                         'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
                         'min_child_weight': [1, 3, 5, 7]},
             scoring='neg_log_loss')
In [40]:
param = {
    'eta': 0.5, 
    'max_depth': 5,  
    'objective': 'multi:softprob',  
    'num_class': 6} 
In [41]:
model = xgb.train(param, D_train, steps)
model = xgb.train(param, D_test, steps)

Performance after tuning parameters

In [42]:
preds = model.predict(D_train)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision on trained data = {}".format(precision_score(y_train, best_preds, average='macro')))
print("Recall on trained data = {}".format(recall_score(y_train, best_preds, average='macro')))
print("Accuracy on trained data = {}".format(accuracy_score(y_train, best_preds)))

preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision on test data = {}".format(precision_score(y_test, best_preds, average='macro')))
print("Recall on test data = {}".format(recall_score(y_test, best_preds, average='macro')))
print("Accuracy on test data = {}".format(accuracy_score(y_test, best_preds)))
Precision on trained data = 0.7252854956096846
Recall on trained data = 0.7121855270536075
Accuracy on trained data = 0.7854222222222222
Precision on test data = 0.9308501184834124
Recall on test data = 0.919942205634557
Accuracy on test data = 0.9431414356787491

There is improvement in performance of test data after tuning parameters like 'eta', 'max_depth', 'num_class'.

Precision = 93%, Recall = 91.9%, Accuracy = 94%

PART - 2¶

In [ ]:
# Importing all necessary libraries
In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()
In [112]:
# As CSV file is already imported in previous step. Performing functions directly on merged dataframe
In [113]:
data_df = pd.merge(telecom1,telecom2,on = 'customerID', how='left')
data_df.head()
Out[113]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [114]:
def dataoveriew(df, message):
    print('Number of rows: ', df.shape[0])
    print("Number of Columns:", df.shape[1])
    print("Column names:")
    print(df.columns.tolist())
    print("Missing values:", df.isnull().sum().values.sum())
    print("Unique values:")
    print(df.nunique())

dataoveriew(data_df, 'Overview of the dataset')
Number of rows:  7043
Number of Columns: 21
Column names:
['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
Missing values: 0
Unique values:
customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

Creating a 'def' function that valuecounts and rounds of to percentage, formates categorical and numerical features for graph visualization of features

In [115]:
def bar(feature, df=data_df ):
    temp_df = df.groupby([feature, 'Churn']).size().reset_index()
    temp_df = temp_df.rename(columns={0:'Count'})
    value_counts_df = df[feature].value_counts().to_frame().reset_index()
    categories = [cat[1][0] for cat in value_counts_df.iterrows()]
    num_list = [num[1][1] for num in value_counts_df.iterrows()]
    div_list = [element / sum(num_list) for element in num_list]
    percentage = [round(element * 100,1) for element in div_list]

    def num_format(list_instance):
        formatted_str = ''
        for index,num in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{num}%, ' 
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{num}% & '
            else:
                formatted_str=formatted_str+f'{num}%'
        return formatted_str
    
    def str_format(list_instance):
        formatted_str = ''
        for index, cat in enumerate(list_instance):
            if index < len(list_instance)-2:
                formatted_str=formatted_str+f'{cat}, '
            elif index == len(list_instance)-2:
                formatted_str=formatted_str+f'{cat} & '
            else:
                formatted_str=formatted_str+f'{cat}'
        return formatted_str
    
    num_str = num_format(percentage)
    cat_str = str_format(categories)

    fig = px.bar(temp_df, x=feature, y='Count', color='Churn', title=f'Churn rate by {feature}', barmode="group", color_discrete_sequence=["green", "red"])
    fig.add_annotation(
                text=f'Value count of distribution of {cat_str} are<br>{num_str} percentage respectively.',
                align='left',
                showarrow=False,
                xref='paper',
                yref='paper',
                x=1.4,
                y=1.3,
                bordercolor='black',
                borderwidth=1)
    fig.update_layout(
        margin=dict(r=600),
    )

    return fig.show()

From the above function we can visualize features in bar graph

Bar graph for Demographic features:-

In [116]:
bar('gender')
bar('Partner')
bar('Dependents')

Insights from above Bar Graph:-

-Indicates that 939 females and 930 males were churned from the company.

-1200 customers not having partner were churned, 669 customers having partner are churned. Comparatively customers not having partner were churned more than customers having partner.

-Clearly churning is less with customers having dependents.

-Churning of SeniorCitizens is less.

-There’s a higher proportion of churn in younger customers, customers with no partners, and customers with no dependents.

In [117]:
data_df.loc[data_df.SeniorCitizen==0,'SeniorCitizen'] = "No"   
data_df.loc[data_df.SeniorCitizen==1,'SeniorCitizen'] = "Yes"  
bar('SeniorCitizen')

Exploring Payment features:-

In [118]:
bar('Contract')
bar('PaperlessBilling')
bar('PaymentMethod')

Insights:-

-The shorter the contract, the higher the churn rate.

-Churn Rate is higher for the customers who opted for paperless billing. About 59.2% of customers use paperless billing.

-Customers who pay with electronic checks are more likely to churn, and this kind of payment is more common than other payment types.

Exploring other features:-

In [119]:
bar('InternetService')
bar('OnlineSecurity')
bar('StreamingTV')
bar('StreamingMovies')

EXPLORING DATA :-

In [120]:
data_df.dtypes
Out[120]:
customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

On observing values in the dataset we can notice that tenure, MonthlyCharges and TotalCharges are represented in numbers. But TotalCharges is of 'object' datatype. Hence, we need to convert it into 'float' as it is having continuous numeric value.

In [121]:
data_df['TotalCharges'] = pd.to_numeric(data_df['TotalCharges'],errors='coerce')

DATA PREPROCESSING:-

In [122]:
# Creating pre-processing function for identifying missing values and impute them, dropping unnecessary feature
In [123]:
def preprocessing(df, message):
    print("Missing values:", df.isnull().sum().values.sum())
    data_df['TotalCharges'] = data_df['TotalCharges'].fillna(data_df['TotalCharges'].median())
    data_df.drop(["customerID"],axis=1,inplace = True)
preprocessing(data_df, 'preprocessing')    
Missing values: 11

ENCODING CATEGORICAL FEATURES :-

In [124]:
def binary_map(feature):
    return feature.map({'Yes':1, 'No':0})
data_df['Churn'] = data_df[['Churn']].apply(binary_map)
data_df['gender'] = data_df['gender'].map({'Male':1, 'Female':0})
binary_list = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
data_df[binary_list] = data_df[binary_list].apply(binary_map)
data_df = pd.get_dummies(data_df, drop_first=True)
In [125]:
bin_df = pd.DataFrame()

bin_df['tenure_bins'] =  pd.qcut(data_df['tenure'], q=3, labels= ['low', 'medium', 'high'])
bin_df['MonthlyCharges_bins'] =  pd.qcut(data_df['MonthlyCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['TotalCharges_bins'] =  pd.qcut(data_df['TotalCharges'], q=3, labels= ['low', 'medium', 'high'])
bin_df['Churn'] = data_df['Churn']

Plotting histogram for three numeric features of dataset :-

In [126]:
def hist(feature):
    group_df = data_df.groupby([feature, 'Churn']).size().reset_index()
    group_df = group_df.rename(columns={0: 'Count'})
    fig = px.histogram(group_df, x=feature, y='Count', color='Churn', marginal='box', title=f'Churn rate frequency to {feature} distribution', color_discrete_sequence=["green", "red"])
    fig.show()
In [127]:
hist('tenure')
hist('MonthlyCharges')
hist('TotalCharges')

CORRELATION :-

In [128]:
corr = data_df.corr()
fig = px.imshow(corr,width=1000, height=1000)
fig.show()
In [129]:
# Generalized Linear Model Regression
In [130]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
all_columns = [column.replace(" ", "_").replace("(", "_").replace(")", "_").replace("-", "_") for column in data_df.columns]
data_df.columns = all_columns
glm_columns = [e for e in all_columns if e not in ['customerID', 'Churn']]
glm_columns = ' + '.join(map(str, glm_columns))
glm_model = smf.glm(formula=f'Churn ~ {glm_columns}', data=data_df, family=sm.families.Binomial())
res = glm_model.fit()
print(res.summary())
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                  Churn   No. Observations:                 7043
Model:                            GLM   Df Residuals:                     7019
Model Family:                Binomial   Df Model:                           23
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2914.7
Date:                Sat, 10 Dec 2022   Deviance:                       5829.3
Time:                        19:40:06   Pearson chi2:                 8.04e+03
No. Iterations:                     7   Pseudo R-squ. (CS):             0.2807
Covariance Type:            nonrobust                                         
=========================================================================================================
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Intercept                                 0.8274      0.748      1.106      0.269      -0.639       2.294
gender                                   -0.0219      0.065     -0.338      0.736      -0.149       0.105
SeniorCitizen                             0.2151      0.085      2.545      0.011       0.049       0.381
Partner                                  -0.0027      0.078     -0.035      0.972      -0.155       0.150
Dependents                               -0.1538      0.090     -1.714      0.087      -0.330       0.022
tenure                                   -0.0594      0.006     -9.649      0.000      -0.071      -0.047
PhoneService                              0.5036      0.692      0.728      0.467      -0.852       1.860
PaperlessBilling                          0.3418      0.074      4.590      0.000       0.196       0.488
MonthlyCharges                           -0.0404      0.032     -1.272      0.203      -0.103       0.022
TotalCharges                              0.0003   7.01e-05      4.543      0.000       0.000       0.000
MultipleLines_No_phone_service            0.3238      0.106      3.061      0.002       0.116       0.531
MultipleLines_Yes                         0.4469      0.177      2.524      0.012       0.100       0.794
InternetService_Fiber_optic               1.7530      0.798      2.198      0.028       0.190       3.316
InternetService_No                       -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineSecurity_No_internet_service       -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineSecurity_Yes                       -0.2055      0.179     -1.150      0.250      -0.556       0.145
OnlineBackup_No_internet_service         -0.2559      0.115     -2.220      0.026      -0.482      -0.030
OnlineBackup_Yes                          0.0258      0.175      0.147      0.883      -0.318       0.369
DeviceProtection_No_internet_service     -0.2559      0.115     -2.220      0.026      -0.482      -0.030
DeviceProtection_Yes                      0.1477      0.176      0.838      0.402      -0.198       0.493
TechSupport_No_internet_service          -0.2559      0.115     -2.220      0.026      -0.482      -0.030
TechSupport_Yes                          -0.1789      0.180     -0.991      0.322      -0.533       0.175
StreamingTV_No_internet_service          -0.2559      0.115     -2.220      0.026      -0.482      -0.030
StreamingTV_Yes                           0.5912      0.326      1.813      0.070      -0.048       1.230
StreamingMovies_No_internet_service      -0.2559      0.115     -2.220      0.026      -0.482      -0.030
StreamingMovies_Yes                       0.6038      0.326      1.850      0.064      -0.036       1.244
Contract_One_year                        -0.6671      0.107     -6.208      0.000      -0.878      -0.456
Contract_Two_year                        -1.3896      0.176     -7.904      0.000      -1.734      -1.045
PaymentMethod_Credit_card__automatic_    -0.0865      0.114     -0.758      0.448      -0.310       0.137
PaymentMethod_Electronic_check            0.3057      0.094      3.236      0.001       0.121       0.491
PaymentMethod_Mailed_check               -0.0567      0.115     -0.493      0.622      -0.282       0.168
=========================================================================================================

Feature importance :-

In [131]:
np.exp(res.params)
Out[131]:
Intercept                                2.287343
gender                                   0.978355
SeniorCitizen                            1.239957
Partner                                  0.997312
Dependents                               0.857471
tenure                                   0.942322
PhoneService                             1.654668
PaperlessBilling                         1.407543
MonthlyCharges                           0.960432
TotalCharges                             1.000318
MultipleLines_No_phone_service           1.382358
MultipleLines_Yes                        1.563475
InternetService_Fiber_optic              5.771657
InternetService_No                       0.774257
OnlineSecurity_No_internet_service       0.774257
OnlineSecurity_Yes                       0.814269
OnlineBackup_No_internet_service         0.774257
OnlineBackup_Yes                         1.026127
DeviceProtection_No_internet_service     0.774257
DeviceProtection_Yes                     1.159152
TechSupport_No_internet_service          0.774257
TechSupport_Yes                          0.836193
StreamingTV_No_internet_service          0.774257
StreamingTV_Yes                          1.806134
StreamingMovies_No_internet_service      0.774257
StreamingMovies_Yes                      1.829067
Contract_One_year                        0.513185
Contract_Two_year                        0.249179
PaymentMethod_Credit_card__automatic_    0.917142
PaymentMethod_Electronic_check           1.357617
PaymentMethod_Mailed_check               0.944913
dtype: float64
In [132]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
data_df['tenure'] = sc.fit_transform(data_df[['tenure']])
data_df['MonthlyCharges'] = sc.fit_transform(data_df[['MonthlyCharges']])
data_df['TotalCharges'] = sc.fit_transform(data_df[['TotalCharges']])
In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
In [134]:
def splitting(df, message):
    X = data_df.drop('Churn', axis=1)
    y = data_df['Churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
splitting(data_df, 'splitting')    
    
In [145]:
# Creating a 'def' function to run any model by simply calling it by name:-
In [135]:
def modeling(alg, alg_name, params={}):
    model = alg(**params) 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    def print_scores(alg, y_true, y_pred):
        print(alg_name)
        acc_score = accuracy_score(y_true, y_pred)
        print("accuracy: ",acc_score)
        pre_score = precision_score(y_true, y_pred)
        print("precision: ",pre_score)
        rec_score = recall_score(y_true, y_pred)
        print("recall: ",rec_score)
        f_score = f1_score(y_true, y_pred, average='weighted')
        print("f1_score: ",f_score)

    print_scores(alg, y_test, y_pred)
    return model
In [136]:
# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression
accuracy:  0.7983909133932797
precision:  0.6281800391389433
recall:  0.5763016157989228
f1_score:  0.7955174819104321
In [137]:
# Feature selection to improve model building
In [138]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
log = LogisticRegression()
rfecv = RFECV(estimator=log, cv=StratifiedKFold(10, random_state=50, shuffle=True), scoring="accuracy")
rfecv.fit(X, y)
Out[138]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=50, shuffle=True),
      estimator=LogisticRegression(), scoring='accuracy')
In [139]:
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(rfecv.grid_scores_)+1), rfecv.grid_scores_)
plt.grid()
plt.xticks(range(1, X.shape[1]+1))
plt.xlabel("Number of Selected Features")
plt.ylabel("CV Score")
plt.title("Recursive Feature Elimination (RFE)")
plt.show()

print("The optimal number of features: {}".format(rfecv.n_features_))
The optimal number of features: 23

Trying other machine learning algorithms :-

In [146]:
# Running different models by simply calling modeling :-
In [140]:
# Running logistic regression model
log_model = modeling(LogisticRegression, 'Logistic Regression')
Logistic Regression
accuracy:  0.7983909133932797
precision:  0.6281800391389433
recall:  0.5763016157989228
f1_score:  0.7955174819104321
In [141]:
# SVC
svc_model = modeling(SVC, 'SVC Classification')
SVC Classification
accuracy:  0.795551348793185
precision:  0.6355748373101953
recall:  0.526032315978456
f1_score:  0.7889704158679894
In [142]:
# Random forest
rf_model = modeling(RandomForestClassifier, "Random Forest Classification")
Random Forest Classification
accuracy:  0.780407004259347
precision:  0.5995717344753747
recall:  0.5026929982046678
f1_score:  0.7738430191302584
In [143]:
# Decision tree
dt_model = modeling(DecisionTreeClassifier, "Decision Tree Classification")
Decision Tree Classification
accuracy:  0.7292948414576431
precision:  0.48776508972267535
recall:  0.5368043087971275
f1_score:  0.7332929726420037
In [144]:
# Naive bayes 
nb_model = modeling(GaussianNB, "Naive Bayes Classification")
Naive Bayes Classification
accuracy:  0.6469474680548982
precision:  0.42011834319526625
recall:  0.8922800718132855
f1_score:  0.6660052398768987

Among all models Logistic Regression shows best accuracy followed by SVC, Random forest, Decision tree and Naive bayes model respectively.

In [ ]:
# 'hist' and 'bar' were created for graphical visualization of certain features

Summary of all 'def' functions with which we can simply call a model by its name :-

In [158]:
dataoveriew # Overview of data
bar # Formatting string and numerical values
preprocessing # Preparation of data for performing models
binary_map # Encoding variables
splitting # Splitting data into 'x' and 'y'
model = modeling(LogisticRegression, 'Logistic Regression') # Simply calling model with best performance
Logistic Regression
accuracy:  0.7983909133932797
precision:  0.6281800391389433
recall:  0.5763016157989228
f1_score:  0.7955174819104321